/*********************************************************************
Purpose: Merge and prepare phone survey and baseline covariates for analysis.
*********************************************************************/

* ---- Session setup: start from a clean Stata state ----
clear all
clear matrix
set linesize 255
set more off
pause on

* Load the plant-level baseline covariates that the rest of this script builds on
use "$BASELINE_DATA_OUT/BaselineCovariates.dta", clear

// 	Generate variables for APCD costs: boiler-capacity size class
** Capital costs pulled from this spreadsheet: https://docs.google.com/spreadsheets/d/1UNsdN85sn891vtabzV8pF1T76U5445cs5ZPWwV2OPDE/edit#gid=0
** Data notes:
**   (a) 14 plants have 0 boiler cap because they are powered by TPH and HAGs.
**   (b) One plant (GJS_334) has 183 boiler cap; this value is accurate.
** Imputation rule for these cases: 0 capacity is left missing, and 183 is grouped into the largest bucket.

* Size class per the spreadsheet thresholds. Buckets are [lower, upper), except
* bucket 1, which excludes 0 (the spreadsheet's lower bound is 1).
* Start from missing rather than 0: any plant whose capacity is zero, negative,
* missing, or >= 185 then stays unclassified instead of landing in a spurious
* "size 0" category.
gen plant_boi_cap_size = .
replace plant_boi_cap_size = 1 if plant_boi_cap > 0   & plant_boi_cap < 4
replace plant_boi_cap_size = 2 if plant_boi_cap >= 4  & plant_boi_cap < 8
replace plant_boi_cap_size = 3 if plant_boi_cap >= 8  & plant_boi_cap < 15
replace plant_boi_cap_size = 4 if plant_boi_cap >= 15 & plant_boi_cap < 25
* spreadsheet upper bound is 90, but the single 183 value is folded into this bucket
replace plant_boi_cap_size = 5 if plant_boi_cap >= 25 & plant_boi_cap < 185

* Counts recorded when this was written -- size 1: 33, size 2: 142, size 3: 109, size 4: 12, size 5: 6.
* The missing option also shows how many plants were left unclassified.
tab plant_boi_cap_size, missing

* Cost matrix copied from the spreadsheet: one row per device type, one entry per
* boiler size class 1-5 (in that order). "." means no cost is given for that class.
* Values are entered as they appear in the spreadsheet; they are multiplied by
* 100000 later in this script.
local cost_cyclone 4 5.5 6.5 8 8
local cost_bagfilter 3.75 9 11.5 12 . 
local cost_scrubber 6 9.5 15 15 .
local cost_esp 45 60 100 135 225

* Assign each plant the installation cost matching its boiler size class
foreach apcd in cyclone bagfilter scrubber esp {

	* start at missing; plants with no size class keep a missing cost
	gen apcd_unitcost_install_`apcd' = .
	label var apcd_unitcost_install_`apcd' "Unit Installation Cost for each `apcd' (Rs)"

	* the k-th entry of the device's cost row belongs to size class k
	forvalues size_class = 1/5 {
		local unit_cost : word `size_class' of `cost_`apcd''
		replace apcd_unitcost_install_`apcd' = `unit_cost' if plant_boi_cap_size==`size_class'
	}
}

// Placeholder maintenance and operating costs, to be used until actual figures are obtained.
// Jahnavi (previously Rohini's PhD student) said FICCI suggested using 3% for operations
// and 6% for maintenance costs. See appendix Table C.3 in her dissertation:
*https://dash.harvard.edu/bitstream/handle/1/40050139/NILEKANI-DISSERTATION-2018.pdf?sequence=4

* annual cost shares, applied to the installation cost
local maint_share 0.06
local ope_share 0.03

foreach apcd in cyclone bagfilter scrubber esp {
	local install_var apcd_unitcost_install_`apcd'

	* maintenance: 6% of installation cost
	gen apcd_unitcost_maint_`apcd' = `maint_share' * `install_var'
	label var apcd_unitcost_maint_`apcd' "Maintenance Cost (annual) for each `apcd' (Rs)"

	* operation: 3% of installation cost
	gen apcd_unitcost_ope_`apcd' = `ope_share' * `install_var'
	label var apcd_unitcost_ope_`apcd' "Operating Cost (annual) for each `apcd' (Rs)"
}

* Restrict the dataset to the variables carried into the output file.

* gpcb_id = unique plant ID; industry_name = plant name; treatmentstatus = treatment status
local id_vars gpcb_id industry_name treatmentstatus

* plant_total_heatoutput = total heat output (boiler tph equivalent)
* pm_conc_etsbl = baseline emissions concentration (mg/Nm3)
* pm_mass_etsbl = [**per hour] baseline emissions load (kg/hr) (where available in pre-experiment period)
local baseline_vars plant_total_heatoutput pm_conc_etsbl pm_mass_etsbl

* D_* = APCD installation dummies as of baseline (cyclone, bag filter, scrubber, ESP)
* *_max = maximal APCD device
* num_* = device counts
local apcd_vars D_cyc D_bf D_scr D_esp cyc_max bf_max scr_esp_max num_cyclones num_bagfilters num_scrubbers num_esps

* install = capital costs (from engineering estimates)
* maint = maintenance costs (6% of installation); ope = operating costs (3% of installation)
local cost_vars apcd_unitcost_install_* apcd_unitcost_maint_* apcd_unitcost_ope_*

keep `id_vars' `baseline_vars' `apcd_vars' `cost_vars'

// Convert costs from lakh rupees to rupees (1 lakh = 100,000), so the values match the "(Rs)" variable labels.
// NOTE(review): assumes the spreadsheet cost matrix is denominated in lakh rupees -- confirm against the spreadsheet.
// Applies to every install, maintenance and operating cost variable.
foreach var of varlist *_unitcost_* {
	replace `var' = 100000 * `var'
}

// Final variable names and labels for the output file

* identifiers
rename industry_name gpcb_name
label var gpcb_id "Unique plant ID"

* treatment dummy: 1 when treatmentstatus is exactly "Treatment", 0 for any other value
gen D_treatment = (treatmentstatus=="Treatment")
label var D_treatment "Plant treatment status"
drop treatmentstatus

* baseline heat output and emissions
rename (plant_total_heatoutput pm_conc_etsbl pm_mass_etsbl) ///
	(heatoutput emissions_conc_etsbl emissions_mass_etsbl)

* APCD presence dummies
rename (D_cyc D_bf D_scr D_esp) ///
	(apcd_present_cyclone apcd_present_bagfilter apcd_present_scrubber apcd_present_esp)

* maximal APCD device
rename (cyc_max bf_max scr_esp_max) ///
	(apcd_maximal_cyclone apcd_maximal_bagfilter apcd_maximal_scrubber_esp)

* APCD device counts
rename (num_cyclones num_bagfilters num_scrubbers num_esps) ///
	(apcd_count_cyclone apcd_count_bagfilter apcd_count_scrubber apcd_count_esp)

* column order: identifiers, treatment, baseline measures, then the APCD variable families
order gpcb_id gpcb_name D_treatment ///
	heatoutput emissions_conc_etsbl emissions_mass_etsbl ///
	apcd_present_* apcd_maximal_* apcd_count_* ///
	apcd_unitcost_install_* apcd_unitcost_maint_* apcd_unitcost_ope_*

* Park the prepared covariates in a temporary file so the panel can be loaded
tempfile baseline_covars
save `baseline_covars'

//	Create dummy if plant is in analysis panel

* Reduce the emissions panel to one row per plant ID
use "$EMISSIONS_DATA_OUT/Rule0_Panel.dta", clear
keep gpcb_id
bysort gpcb_id: keep if _n == 1

* Merge the covariates back on. The merge indicator is stored as D_analysis:
*   1 = plant only in the panel, 2 = plant only in the covariates, 3 = in both.
merge 1:1 gpcb_id using `baseline_covars', gen(D_analysis)

* Collapse the indicator to a 0/1 flag: covariates-only plants become 0, matched
* plants become 1, and panel-only plants keep their code of 1.
recode D_analysis (2 = 0) (3 = 1)
label define D_analysis 0 "Excluded in analysis" 1 "Included in analysis"
label values D_analysis D_analysis
label var D_analysis "=1 if included in emissions analysis"

order D_analysis, after(D_treatment)

save "$PHONE_DATA_OUT/apcd_panel_plant.dta", replace
